# -*- coding: utf-8 -*-
import feedparser
import os
import codecs
import re

def remove_symbols(data):
	"""Return ``data`` with every character matched by the symbol class removed.

	Inside the character class, ``)-_`` is interpreted by ``re`` as a range
	running from ``)`` through ``_``. As a result ASCII digits, uppercase
	ASCII letters and punctuation such as ``.``, ``/`` and ``:`` are removed
	too, in addition to the symbols that are spelled out literally.
	"""
	return re.sub(r'[\!@#$%&*()-_+="]', '', data)
		
def remove_extra_spaces(data):
	"""Collapse every run of whitespace in ``data`` into one space character."""
	return re.sub(r'\s+', ' ', data)

def remove_html_tags(data):
	"""Delete every ``<...>`` span from ``data``.

	The match is non-greedy, so each tag is removed on its own, and ``.`` does
	not match a newline, so a span is only removed when ``<`` and ``>`` sit on
	the same line.
	"""
	return re.sub(r'<.*?>', '', data)
		
# Punctuation, the "[" bracket and ASCII digits that are dropped from feed text.
# The "[" is escaped so the class cannot be mistaken for a nested set.
_AUX_SYMBOLS = re.compile(r'[-_&:,.;"$@!^*()\[0-9]')


def remove_symbols_aux(data):
	"""Strip punctuation and digits from ``data``, blanking ``&quot;`` entities.

	Each literal ``&quot;`` entity is turned into a single space *before* the
	symbols are stripped. Doing it in this order means the entity no longer
	leaves a stray "quot" behind, and ordinary words that merely contain
	"quot" (for example "quota") are left intact.

	:param data: text to clean.
	:return: the cleaned text.
	"""
	without_entities = data.replace("&quot;", ' ')
	return _AUX_SYMBOLS.sub('', without_entities)


		
def clearTxt(data):
	"""Clean a piece of feed text and return the result.

	The three helpers run in a fixed order: whitespace runs are collapsed
	first, then ``<...>`` tags are removed, and finally the characters handled
	by ``remove_symbols_aux`` are stripped.
	"""
	return remove_symbols_aux(remove_html_tags(remove_extra_spaces(data)))

# Root directory for the downloaded articles (one sub-directory per category).
# It is a relative path, so it is resolved against the current working directory.
workspace = "data"

# RSS feed URLs grouped by news category. Every URL carries an explicit
# "http://" scheme so that it is fetched over the network.
urlFeeds = {
	# Sports
	"Deportes": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=131",
		"http://www.perfil.com/rss/deportes.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Deportes",
		"http://ole.feedsportal.com/c/33068/f/577712/index.rss",
		"http://www.infobae.com/rss/deportes.xml",
		"http://www.cronica.com.ar/rss/deportes.xml",
		"http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=9",
		"http://www.canchallena.com/herramientas/rss",
	],
	# Economy
	"Economia": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=272",
		"http://www.perfil.com/rss/economia.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Econom%EDa",
		"http://www.infobae.com/rss/economia.xml",
		"http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5",
	],
	# Culture and entertainment
	"Espectaculos": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=1",  # culture
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=120",  # entertainment
		"http://www.perfil.com/rss/cultura.xml",  # culture
		"http://www.perfil.com/rss/espectaculos.xml",  # entertainment
		"http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos",
		"http://www.infobae.com/rss/teleshow.xml",
		"http://www.cronica.com.ar/rss/espectaculos.xml",
		"http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=10",
		"http://www.pagina12.com.ar/diario/rss/espectaculos.xml",
	],
	# Crime and general information
	"Informacion General": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=21",
		"http://www.perfil.com/rss/policia.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General",
		"http://www.cronica.com.ar/rss/policiales.xml",
	],
	# International
	"Internacional": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=7",
		"http://www.perfil.com/rss/internacional.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Internacionales",
		"http://www.infobae.com/rss/mundo.xml",
		"http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=7",
	],
	# Politics
	"Politica": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=30",
		"http://www.perfil.com/rss/politica.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica",
		"http://www.infobae.com/rss/politica.xml",
		"http://www.cronica.com.ar/rss/politica.xml",
		"http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4",
	],
	# Science and technology
	"Ciencia y Tecnologia": [
		"http://servicios.lanacion.com.ar/herramientas/rss/categoria_id=432",
		"http://www.perfil.com/rss/tecnologia.xml",
		"http://www.perfil.com/rss/ciencia.xml",
		"http://www.ambito.com/rss/noticias.asp?S=Tecnolog%EDa",
		"http://www.infobae.com/rss/tecnologia.xml",
		"http://feeds.feedburner.com/fayerwayer",
	],
}

# Per-category tally of articles written during this run; every counter starts at 0.
newNewsCount = dict.fromkeys(
	(
		"Deportes",
		"Economia",
		"Espectaculos",
		"Informacion General",
		"Internacional",
		"Politica",
		"Ciencia y Tecnologia",
	),
	0
)


# Fetch every configured feed and store each entry that has not been saved yet
# as a text file under <workspace>/<category>/, counting the files created.
for category in urlFeeds:
	category_dir = os.path.join(workspace, category)
	# Opening a file for writing inside a missing directory fails, so make
	# sure the category directory exists before any entry is written.
	if not os.path.isdir(category_dir):
		os.makedirs(category_dir)
	for url in urlFeeds[category]:
		feed = feedparser.parse(url)
		for n in feed.entries:
			# Entries are dict-like and a feed may omit any field, so read
			# them with .get() instead of attribute access.
			entry_link = n.get("link")
			if not entry_link:
				# The file name is derived from the link; skip entries without one.
				continue
			link = remove_symbols(entry_link.encode("utf-8")) + ".txt"
			path = os.path.join(category_dir, link)
			if os.path.exists(path):
				# Already stored by a previous run.
				continue
			title = clearTxt(n.get("title", ""))
			summary = clearTxt(n.get("summary", ""))
			# The context manager closes (and flushes) the file even on error.
			with codecs.open(path, "w", "utf-8") as out:
				out.write(title + " " + summary)
			newNewsCount[category] += 1

# Single-argument print(...) produces the same output as the print statement.
print("Cantidad de nuevas noticias agregadas: ")
print(newNewsCount)